data loading

data scaling

After checking for normalization, we scaled our data in the first place to provide the scaled data for further analysis.

3. Main questions

Question 1: Pedicted GI50-values

As we mentioned in our presentation, we want to create a model to predict GI50-values thus to predict, if Lapatinib is a good choice., The first linear model trys to predict the G-50 value under the data of the doubling time.

Fold_ChangeLap = select(Fold_Change, contains("Lapa"))
NegLogGI50Lap = NegLogGI50[9,]
means = colMeans(Fold_ChangeLap)
Fold_Changemeans = as.data.frame(t(means))

a2 = gsub(x = colnames (Fold_Changemeans), pattern = "_lapatinib_10000nM_24h", replacement = "")
colnames(Fold_Changemeans) = a2

a3 = gsub(x = a2, pattern = "X7", replacement = "7")
colnames(Fold_Changemeans) = a3


a1 = gsub(x = colnames (NegLogGI50Lap), pattern = "-", replacement = ".")
colnames(NegLogGI50Lap) = a1

c1 = rbind(a1,NegLogGI50Lap)
c2 = rbind(a3,Fold_Changemeans)

c1 = t(c1)
c2 = t(c2)

c1 =as.data.frame(c1)
c2 =as.data.frame(c2)


c3 = subset(c1, `1` %in% intersect(c1$`1`, c2$V1))
c4 = as.numeric(as.character(c3$lapatinib))
adjustedNeglogI50Lap = as.data.frame(c4)



Fold_Changemeans = as.data.frame(t(Fold_Changemeans))

combined1 = cbind(adjustedNeglogI50Lap, Fold_Changemeans)

names1 = c( "NegLogI50Lap","Fold_Changemeans")
colnames(combined1) = names1
                      
lmFold = lm(NegLogI50Lap ~ Fold_Changemeans, data = combined1)

summary(lmFold)
## 
## Call:
## lm(formula = NegLogI50Lap ~ Fold_Changemeans, data = combined1)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -1.0574 -0.4099 -0.1873  0.2076  2.0682 
## 
## Coefficients:
##                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      5.464e+00  9.539e-02  57.281   <2e-16 ***
## Fold_Changemeans 1.913e+15  7.519e+14   2.544    0.014 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.6822 on 52 degrees of freedom
## Multiple R-squared:  0.1107, Adjusted R-squared:  0.09355 
## F-statistic:  6.47 on 1 and 52 DF,  p-value: 0.01398
qqnorm(lmFold$residuals, main = "Test for normaldistribution of residuals")
qqline(lmFold$residuals)

plot(combined1$NegLogI50Lap, lmFold$fitted.values, pch = 20, col = "blue", xlab = "Real values", 
     ylab = "Predicted values", main = "Comparison: real and predicted values ~ linear regression (Fold_Changemeans)")
abline(0, 1, col = "red")

cor(combined1$NegLogI50Lap,combined1$Fold_Changemeans)
## [1] 0.3326477
#Split the data (Training - Testing)

n = nrow(combined1)
rmse1 = sqrt(1/n * sum(lmFold$residuals^2))
rmse1
## [1] 0.6694461
i1.train = sample(1:nrow(combined1), 44)

dat1.train = combined1[i1.train, ]
dat1.test = combined1[-i1.train, ]

l1.train = lm(NegLogI50Lap ~ Fold_Changemeans, data = dat1.train)
summary(l1.train)
## 
## Call:
## lm(formula = NegLogI50Lap ~ Fold_Changemeans, data = dat1.train)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -1.0139 -0.3949 -0.1546  0.1953  1.8922 
## 
## Coefficients:
##                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      5.439e+00  1.073e-01  50.685   <2e-16 ***
## Fold_Changemeans 1.584e+15  8.417e+14   1.882   0.0667 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.6634 on 42 degrees of freedom
## Multiple R-squared:  0.0778, Adjusted R-squared:  0.05584 
## F-statistic: 3.543 on 1 and 42 DF,  p-value: 0.06673
n = nrow(dat1.train)
rmse1.train = sqrt(1/n * sum(l1.train$residuals^2))
rmse1.train
## [1] 0.6481528
pred1 = predict(l1.train, newdata = dat1.test)

n = nrow(dat1.test)
residuals = dat1.test$NegLogI50Lap - pred1
rmse1.test1 = sqrt(1/n * sum(residuals^2))
rmse1.test1
## [1] 0.7660867

The second linear model trys to predict the G-50 value under the data of the Foldchange-means.

NegLogGI50Lap = NegLogGI50[9,]

#Sort by Cellline-Name
df = arrange(Cellline_Annotation, Cell_Line_Name)
Doublingtime = cbind.data.frame (df$Cell_Line_Name, df$Doubling_Time)

c21 = as.data.frame(t(NegLogGI50Lap))

combined2 = cbind(c21, Doublingtime$`df$Doubling_Time`)
names2 = c( "NegLogI50Lap","Doubling_Time")
colnames(combined2) = names2

combined2 =na.omit(combined2)

lmDouble = lm(NegLogI50Lap ~ Doubling_Time, data = combined2)

summary(lmDouble)
## 
## Call:
## lm(formula = NegLogI50Lap ~ Doubling_Time, data = combined2)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -1.2751 -0.4124 -0.1210  0.1709  2.0784 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   5.147415   0.245361  20.979   <2e-16 ***
## Doubling_Time 0.010536   0.006391   1.649    0.105    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.6741 on 58 degrees of freedom
## Multiple R-squared:  0.04476,    Adjusted R-squared:  0.0283 
## F-statistic: 2.718 on 1 and 58 DF,  p-value: 0.1046
qqnorm(lmDouble$residuals, main = "Test for normaldistribution of residuals")
qqline(lmDouble$residuals)

plot(combined2$NegLogI50Lap, lmDouble$fitted.values, pch = 20, col = "blue", xlab = "Real values", 
     ylab = "Predicted values", main = "Comparison: real and predicted values ~ linear regression (Doubling-Time)")
abline(0, 1, col = "red")

cor(combined2$NegLogI50Lap,combined2$Doubling_Time)
## [1] 0.2115772
#Split the data (Training - Testing)

n = nrow(combined2)
rmse2 = sqrt(1/n * sum(lmDouble$residuals^2))
rmse2
## [1] 0.6627233
i2.train = sample(1:nrow(combined2), 48)

dat2.train = combined2[i2.train, ]
dat2.test = combined2[-i2.train, ]

l2.train = lm(NegLogI50Lap ~ Doubling_Time, data = dat2.train)
summary(l2.train)
## 
## Call:
## lm(formula = NegLogI50Lap ~ Doubling_Time, data = dat2.train)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.22894 -0.34126 -0.07953  0.14257  2.12501 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)    5.13037    0.26457  19.392   <2e-16 ***
## Doubling_Time  0.01006    0.00676   1.488    0.144    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.6736 on 46 degrees of freedom
## Multiple R-squared:  0.04592,    Adjusted R-squared:  0.02518 
## F-statistic: 2.214 on 1 and 46 DF,  p-value: 0.1436
n = nrow(dat2.train)
rmse2.train = sqrt(1/n * sum(l2.train$residuals^2))
rmse2.train
## [1] 0.6594277
pred2 = predict(l2.train, newdata = dat2.test)

n = nrow(dat1.test)
residuals = dat2.test$NegLogI50Lap - pred2
rmse2.test = sqrt(1/n * sum(residuals^2))
rmse2.test
## [1] 0.7451312

As a last part, we did a multiple regression with both datasets to predict GI50-values.

b1 = gsub(x =Doublingtime$`df$Cell_Line_Name`, pattern = "-", replacement = ".")
Doublingtime1 =  rbind(b1,Doublingtime$`df$Doubling_Time`)
Doublingtime1 = as.data.frame(t(Doublingtime1)) 

c31 = subset(Doublingtime1, b1 %in% intersect(Doublingtime1$b1, c2$V1))
c41 = as.numeric(as.character(c31$V2))
adjustedDoubling_Time = as.data.frame(c41)

combined3 = cbind(adjustedNeglogI50Lap, Fold_Changemeans, adjustedDoubling_Time)
names3 = c( "NegLogI50Lap","Fold_Changemeans","Doubling_Time")
colnames(combined3) = names3

mlr = lm(NegLogI50Lap ~ ., data = combined3)

summary(mlr)
## 
## Call:
## lm(formula = NegLogI50Lap ~ ., data = combined3)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -1.3123 -0.3909 -0.1226  0.2003  1.8141 
## 
## Coefficients:
##                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      5.064e+00  2.655e-01  19.069   <2e-16 ***
## Fold_Changemeans 1.819e+15  7.429e+14   2.449   0.0178 *  
## Doubling_Time    1.083e-02  6.717e-03   1.612   0.1130    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.6719 on 51 degrees of freedom
## Multiple R-squared:  0.1538, Adjusted R-squared:  0.1206 
## F-statistic: 4.635 on 2 and 51 DF,  p-value: 0.01415
qqnorm(mlr$residuals, main = "Test for normaldistribution of residuals")
qqline(mlr$residuals)

plot(combined3$NegLogI50Lap, mlr$fitted.values, pch = 20, col = "blue", xlab = "Real values", 
     ylab = "Predicted values" , main = "Comparison: real and predicted values ~ multiple regression")
abline(0, 1, col = "red")

#Split the data (Training - Testing)

n = nrow(combined3)
rmse3 = sqrt(1/n * sum(mlr$residuals^2))
rmse3
## [1] 0.6530072
i3.train = sample(1:nrow(combined2), 44)

dat3.train = combined3[i3.train, ]
dat3.test = combined3[-i3.train, ]

l3.train = lm(NegLogI50Lap ~ ., data = dat3.train)
summary(l3.train)
## 
## Call:
## lm(formula = NegLogI50Lap ~ ., data = dat3.train)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -0.9238 -0.4782 -0.1381  0.1927  1.6350 
## 
## Coefficients:
##                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      4.789e+00  3.389e-01  14.132   <2e-16 ***
## Fold_Changemeans 1.150e+15  8.409e+14   1.368   0.1797    
## Doubling_Time    2.011e-02  8.917e-03   2.255   0.0301 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.6632 on 37 degrees of freedom
##   (4 observations deleted due to missingness)
## Multiple R-squared:  0.1814, Adjusted R-squared:  0.1372 
## F-statistic:   4.1 on 2 and 37 DF,  p-value: 0.02464
n = nrow(dat3.train)
rmse3.train = sqrt(1/n * sum(l3.train$residuals^2))
rmse3.train
## [1] 0.6081835
pred3 = predict(l3.train, newdata = dat3.test)

n = nrow(dat3.test)
residuals = dat3.test$NegLogI50Lap - pred3
rmse3.test = sqrt(1/n * sum(residuals^2))
rmse3.test
## [1] 0.7577508

As you can see from the data, All three regression models are not really good.

Question 2: Erlotinib vs Lapatinib

# correlation in general 
n= as.data.frame(t(NegLogGI50))
rmv.rows = apply(n, 1, function(x) {
  sum(is.na(x))
})
NLGI50.all = n[-which(rmv.rows > 0), ]  # Removing any row with 1 or more missing values
rm(rmv.rows, n, NegLogGI50)
cor.mat = as.data.frame(cor(NLGI50.all[, 1:ncol(NLGI50.all)], method = "pearson")) #Pearson correlation
round(cor.mat, 2) #round values
##               5-Azacytidine bortezomib cisplatin dasatinib doxorubicin
## 5-Azacytidine          1.00      -0.08      0.16      0.18        0.29
## bortezomib            -0.08       1.00      0.01     -0.10        0.32
## cisplatin              0.16       0.01      1.00     -0.24        0.52
## dasatinib              0.18      -0.10     -0.24      1.00       -0.08
## doxorubicin            0.29       0.32      0.52     -0.08        1.00
## erlotinib              0.27      -0.32      0.01      0.42       -0.17
## geldanamycin           0.23       0.36      0.19     -0.09        0.23
## gemcitibine            0.16      -0.08      0.53     -0.03        0.37
## lapatinib              0.14      -0.26     -0.07      0.19       -0.16
## paclitaxel             0.10       0.20      0.01     -0.10        0.55
## sirolimus             -0.05       0.01      0.27      0.07        0.17
## sorafenib              0.09       0.27     -0.01     -0.24        0.14
## sunitinib              0.12      -0.01     -0.05     -0.03       -0.14
## topotecan              0.14       0.13      0.55      0.02        0.60
## vorinostat             0.16      -0.02      0.07     -0.16       -0.06
##               erlotinib geldanamycin gemcitibine lapatinib paclitaxel
## 5-Azacytidine      0.27         0.23        0.16      0.14       0.10
## bortezomib        -0.32         0.36       -0.08     -0.26       0.20
## cisplatin          0.01         0.19        0.53     -0.07       0.01
## dasatinib          0.42        -0.09       -0.03      0.19      -0.10
## doxorubicin       -0.17         0.23        0.37     -0.16       0.55
## erlotinib          1.00        -0.01        0.01      0.65      -0.37
## geldanamycin      -0.01         1.00        0.12     -0.01       0.28
## gemcitibine        0.01         0.12        1.00     -0.15       0.03
## lapatinib          0.65        -0.01       -0.15      1.00      -0.24
## paclitaxel        -0.37         0.28        0.03     -0.24       1.00
## sirolimus          0.21        -0.21        0.05      0.21      -0.04
## sorafenib         -0.29         0.14       -0.01     -0.25       0.29
## sunitinib          0.06         0.24        0.06      0.12      -0.02
## topotecan         -0.02         0.21        0.63     -0.14       0.20
## vorinostat         0.12         0.20        0.18      0.26       0.09
##               sirolimus sorafenib sunitinib topotecan vorinostat
## 5-Azacytidine     -0.05      0.09      0.12      0.14       0.16
## bortezomib         0.01      0.27     -0.01      0.13      -0.02
## cisplatin          0.27     -0.01     -0.05      0.55       0.07
## dasatinib          0.07     -0.24     -0.03      0.02      -0.16
## doxorubicin        0.17      0.14     -0.14      0.60      -0.06
## erlotinib          0.21     -0.29      0.06     -0.02       0.12
## geldanamycin      -0.21      0.14      0.24      0.21       0.20
## gemcitibine        0.05     -0.01      0.06      0.63       0.18
## lapatinib          0.21     -0.25      0.12     -0.14       0.26
## paclitaxel        -0.04      0.29     -0.02      0.20       0.09
## sirolimus          1.00     -0.11     -0.20      0.03       0.02
## sorafenib         -0.11      1.00      0.05      0.16       0.10
## sunitinib         -0.20      0.05      1.00     -0.05      -0.13
## topotecan          0.03      0.16     -0.05      1.00       0.08
## vorinostat         0.02      0.10     -0.13      0.08       1.00

produce pairwise scatter plots

pairs(NLGI50.all[, 1:ncol(NLGI50.all)], pch = 20, cex = 0.8, col = "royalblue3", main = "Correlation_NegLogGI50") 

matrix of correlations

cor = cor(NLGI50.all)
pheatmap(cor, col = cm.colors(256), main = "Pheatmap correlation")

Erlotinib and lapatinib have a strong correlation.

plot erlotinib all genes, coloured by tissue

#differece
diff = data.frame(erlotinib = NLGI50.all$erlotinib - mean(NLGI50.all$erlotinib), lapatinib = NLGI50.all$lapatinib- mean(NLGI50.all$lapatinib))
diff$celllines = rownames(NLGI50.all)
#create vector to insert column tissue from Metadata

tissue = sapply(1:nrow(diff), function(x) {
  position = which(as.character(Metadata$cell) == diff[x, "celllines"])[1] #if tissue occurs several times, take the first
  out = as.character(Metadata[position, "tissue"]) #output the tissue at this position
  return(out)
})
diff$tissue = tissue
rm(tissue)

diff$celllines = factor(diff$celllines, levels = diff$celllines[order(diff$tissue)]) #Classified by tissue

ggplot(diff, aes(x = celllines, y = erlotinib, fill = tissue))+geom_bar(stat = "identity") + coord_flip() + labs(title = "Mean graph plot of NLGI50 values for Erlotinib")

The difference from the NegLogGI50 for a particular cell line and the mean NegLogGI50 is plotted here for Erlotinib.

plot lapatinib all genes, coloured by tissue

ggplot(diff, aes(x = celllines, y = lapatinib, fill = tissue)) + geom_bar(stat="identity") + coord_flip() + labs(title="Mean graph plot of NLGI50 values for Lapatinib")

The difference from the NegLogGI50 for a particular cell line and the mean NegLogGI50 is plotted here for Lapatinib.

correlation erlotinib , lapatinib

cor(NLGI50.all$erlotinib, NLGI50.all$lapatinib, method = "pearson")
## [1] 0.6528188

A Pearson correlation coefficient of ~ 0.65 confirms that these patterns are similar.

Lung genes

#only lung with mean all
### load data
Metadata_Lapatinib_treated = Metadata[which(Metadata$drug == "lapatinib" & Metadata$dose != "0nM"),]
NegLogGI50 = as.data.frame(readRDS(paste0(wd, "/Data/NegLogGI50.rds")))
#lung genes from Metadata
Lung_Metadata_L_treated = Metadata[which(Metadata$drug == "lapatinib" & Metadata$dose != "0nM" & Metadata$tissue == "Lung"),] 
celllines = Lung_Metadata_L_treated$cell
NegLogGI50.lung = as.data.frame(t(NegLogGI50[c("erlotinib", "lapatinib"), celllines]))

#Difference
dif.NegLogGI50.lung = data.frame(erlotinib = NegLogGI50.lung$erlotinib -  mean(NLGI50.all$erlotinib), lapatinib = NegLogGI50.lung$lapatinib -  mean(NLGI50.all$lapatinib)) #erlotinib data - mean value, lapatinib data - mean value
dif.NegLogGI50.lung$celllines = rownames(NegLogGI50.lung)

# PLot

ggplot(dif.NegLogGI50.lung,aes(x = celllines, y = erlotinib)) + geom_bar(stat = "identity", fill = "skyblue") + geom_text(aes(label = round(erlotinib, 2)), vjust = -0.5, color = "black", size = 3) + coord_flip() + labs(title = "Mean graph plot of NLGI50 values for Erlotinib, only Lung genes")

plot lapatinib

ggplot(dif.NegLogGI50.lung,aes(x = celllines, y = lapatinib)) + geom_bar(stat = "identity", fill = "skyblue") + geom_text(aes(label=round(lapatinib, 2)), vjust = -0.5, color = "black", size = 3) + coord_flip() + labs(title = "Mean graph plot of NLGI50 values for Lapatinib, only Lung genes")

correlation lung

cor(NegLogGI50.lung$erlotinib, NegLogGI50.lung$lapatinib, method = "pearson")
## [1] 0.9609488

A pearson correlation coefficent of ~ 0.96 suggests that Lapatinib has a similar effect on lung cancer as Erlotinib

Anova

selection of Lapatinib and Erlotinib treated cells

lapa<-data.frame(Metadata[which(Metadata[,'drug'] == "lapatinib"), ])
erlo<-data.frame(Metadata[which(Metadata[,'drug'] == "erlotinib"), ])
el<-right_join(lapa,erlo, by="cell")
rmv.rows = apply(el, 1, function(x) {
  sum(is.na(x))
})  # Go through each row and sum up all missing values
row.names(rmv.rows)

Create data frame with lapatinib and erlotinib data

fc<-data.frame(Treated-Untreated)
dim(fc)
## [1] 13299   819
all<-data.frame(fc[grep("lapatinib|erlotinib", colnames(fc))])
dim(all)
## [1] 13299   113

since erlotinip contains more columns than lapatinib, we have to remove these columns

all.rmv<-data.frame(all %>% select(
                    -"CCRF.CEM_erlotinib_10000nM_24h", 
                    -"HL.60_erlotinib_10000nM_24h", 
                    -"HT29_erlotinib_10000nM_24h", 
                    -"K.562_erlotinib_10000nM_24h", 
                    -"LOX_erlotinib_10000nM_24h",
                    -"SR_erlotinib_10000nM_24h",
                    -"COLO.205_lapatinib_10000nM_24h"))
dim(all.rmv)
## [1] 13299   106

Checking the rows

la<-data.frame(all.rmv[grep("lapatinib", colnames(all.rmv))])
ncol(la)
## [1] 53
er<-data.frame(all.rmv[grep("erlotinib", colnames(all.rmv))])
ncol(er)
## [1] 53
erla<-data.frame(er,la)
ncol(all.rmv) #to prove if the columns are removed
## [1] 106

Anova

drug<-c(rep('Erlotinib',53), rep('Lapatinib',53))
expression_drug<-apply(erla, MARGIN = 2, sum)
df_drug<-data.frame(expression_drug, drug)
library(ggpubr)
ggboxplot (data = df_drug, x="drug", y="expression_drug", color = "drug",    
          add = "jitter", legend = "none")+
          rotate_x_text(angle = 45)+
          stat_compare_means(method = "anova")+        # Add global annova p-value
          stat_compare_means(label = "p.signif", method = "t.test",
          ref.group = ".all.", hide.ns = TRUE)      # Pairwise comparison against all

Question 3: Comparing lapatinib treated breast and cns celllines

L_fc <- select(Fold_Change, contains("Lapa"))
L_fc <- as.data.frame(t(L_fc))
rownames(Metadata) <- Metadata$sample


L_treated <- select(Treated, contains("Lapa"))
L_treated <- t(L_treated)
L_untreated <- select(Untreated, contains("Lapa"))
L_untreated <- t(L_untreated)



# selecting breast Lapatinib samples
breast <- Metadata[Metadata[,'tissue']=="Breast",]
rownames(breast) <- breast$sample
rownames(breast) <- gsub(x = rownames(breast), pattern = "-", replacement = ".")  

breastFC <- subset(L_fc, rownames(L_fc) %in% rownames(breast))
breastTreated <- subset(L_treated, rownames(L_treated) %in% rownames(breast))
breastUntreated <- subset(L_untreated, rownames(L_untreated) %in% rownames(breast))#


# selecting CNS Lapatinib samples
cns <- Metadata[Metadata[,'tissue']=="CNS",]
rownames(cns) <- cns$sample
rownames(cns) <- gsub(x = rownames(cns), pattern = "-", replacement = ".")

cnsFC <- subset(L_fc, rownames(L_fc) %in% rownames(cns))
cnsTreated <- subset(L_treated, rownames(L_treated) %in% rownames(cns))
cnsUntreated <- subset(L_untreated, rownames(L_untreated) %in% rownames(cns))
#performing a paired t-test of treated and untreated samples
t_test_cns <- col_t_paired(cnsTreated, cnsUntreated, alternative = "two.sided", mu = 0,conf.level = 0.95)
t_test_breast <- col_t_paired(breastTreated, breastUntreated, alternative = "two.sided", mu = 0,conf.level = 0.95)

#obtaining Benjamini-Hochberg adjusted p-values
pval_cns <- t_test_cns$pvalue
pval_breast <- t_test_breast$pvalue

fdr_cns <- p.adjust(pval_cns, "BH")
fdr_breast <- p.adjust(pval_breast, "BH")


#obtaining mean FC values over all samples 
breastFCm <- as.numeric(colMeans(breastFC))
cnsFCm <- as.numeric(colMeans(cnsFC))
genes <- colnames(breastFC)
## breast volcano plot
#creating a matrix containg all needed values for plotting
diff_df_breast <- data.frame(gene = genes, Fold = breastFCm, FDR = fdr_breast)
diff_df_breast$absFold <- abs(diff_df_breast$Fold)
head(diff_df_breast)
##     gene         Fold       FDR     absFold
## 1   A1CF  0.037268413 0.8765540 0.037268413
## 2    A2M -0.032213825 0.7188608 0.032213825
## 3 A4GALT  0.006012452 0.9793436 0.006012452
## 4  A4GNT -0.053969518 0.4235638 0.053969518
## 5   AAAS  0.081656784 0.5283372 0.081656784
## 6   AACS  0.023767096 0.8115022 0.023767096
# add a grouping column; default value is "not significant"
diff_df_breast$group <- "NotSignificant"



# change the grouping for the entries with significance but not a large enough Fold change
diff_df_breast[which(diff_df_breast['FDR'] < 0.5 & (diff_df_breast['absFold']) < 0.2 ),"group"] <- "Significant"

# change the grouping for the entries a large enough Fold change but not a low enough p value
diff_df_breast[which(diff_df_breast['FDR'] > 0.5 & (diff_df_breast['absFold']) > 0.2 ),"group"] <- "FoldChange"

# change the grouping for the entries with both significance and large enough fold change
diff_df_breast[which(diff_df_breast['FDR'] < 0.5 & (diff_df_breast['absFold']) > 0.2 ),"group"] <- "Significant&FoldChange"


# Find and label the top peaks.
top_peaks_breast <- diff_df_breast[with(diff_df_breast, order(Fold, FDR)),][1:10,]
top_peaks_breast <- rbind(top_peaks_breast, diff_df_breast[with(diff_df_breast, order(-Fold, FDR)),][1:10,])


# Add gene labels for all of the top genes we found
# creating an empty list, and filling it with entries for each row in the dataframe
# each list entry is another list with named items that will be used 
a <- list()
for (i in seq_len(nrow(top_peaks_breast))) {
  m <- top_peaks_breast[i, ]
  a[[i]] <- list(
    x = m[["Fold"]],
    y = -log10(m[["FDR"]]),
    text = m[["gene"]],
    xref = "x",
    yref = "y",
    showarrow = TRUE,
    arrowhead = 0.5,
    ax = 20,
    ay = -40
  )
}

plot_breast <- plot_ly(data = diff_df_breast, x = diff_df_breast$Fold, y = -log10(diff_df_breast$FDR), type = "scatter", text = diff_df_breast$gene, mode = "markers", color = diff_df_breast$group) %>% 
  layout(title ="Volcano Plot of Lapatinib breast cancer samples", 
         xaxis = list(title="log2 Fold Change"),
         yaxis = list(title="FDR")) %>%
  layout(annotations = a)
plot_breast
###thresholds still need to be discussed
## CNS volcano plot

diff_df_cns <- data.frame(gene = genes, Fold = cnsFCm, FDR = fdr_cns)
diff_df_cns$absFold <- abs(diff_df_cns$Fold)
head(diff_df_cns)
##     gene         Fold       FDR     absFold
## 1   A1CF  0.066575311 0.5939566 0.066575311
## 2    A2M  0.038348381 0.6873009 0.038348381
## 3 A4GALT  0.000390011 0.9980719 0.000390011
## 4  A4GNT -0.018219799 0.8780106 0.018219799
## 5   AAAS  0.014723327 0.9008420 0.014723327
## 6   AACS  0.003887384 0.9870209 0.003887384
# add a grouping column; default value is "not significant"
diff_df_cns$group <- "NotSignificant"


# change the grouping for the entries with significance but not a large enough Fold change
diff_df_cns[which(diff_df_cns['FDR'] < 0.5 & (diff_df_cns['absFold']) < 0.2 ),"group"] <- "Significant"

# change the grouping for the entries a large enough Fold change but not a low enough p value
diff_df_cns[which(diff_df_cns['FDR'] > 0.5 & (diff_df_cns['absFold']) > 0.2 ),"group"] <- "FoldChange"

# change the grouping for the entries with both significance and large enough fold change
diff_df_cns[which(diff_df_cns['FDR'] < 0.5 & (diff_df_cns['absFold']) > 0.2 ),"group"] <- "Significant&FoldChange"


# Find and label the top peaks..
top_peaks_cns <- diff_df_cns[with(diff_df_cns, order(Fold, FDR)),][1:10,]
top_peaks_cns <- rbind(top_peaks_cns, diff_df_cns[with(diff_df_cns, order(-Fold, FDR)),][1:10,])


a <- list()
for (i in seq_len(nrow(top_peaks_cns))) {
  m <- top_peaks_cns[i, ]
  a[[i]] <- list(
    x = m[["Fold"]],
    y = -log10(m[["FDR"]]),
    text = m[["gene"]],
    xref = "x",
    yref = "y",
    showarrow = TRUE,
    arrowhead = 0.5,
    ax = 20,
    ay = -40
  )
}

plot_cns <- plot_ly(data = diff_df_cns, x = diff_df_cns$Fold, y = -log10(diff_df_cns$FDR),type = "scatter", text = diff_df_cns$gene, mode = "markers", color = diff_df_cns$group) %>% 
  layout(title ="Volcano Plot of Lapatinib CNS cancer samples",
         xaxis = list(title="log2 Fold Change"),
         yaxis = list(title="FDR"))%>%
  layout(annotations = a)
plot_cns
# selecet top peak genes common in cns and breast tissue
tpb_comparison <- subset(top_peaks_breast, gene %in% top_peaks_cns$gene)
tpc_comparison <- subset(top_peaks_cns, gene %in% top_peaks_breast$gene)


# order common genes alphabetically
tpb_comparison <- tpb_comparison[order(tpb_comparison$gene),]
tpc_comparison <- tpc_comparison[order(tpc_comparison$gene),]


## creating heat map of FCs to compare values 
cor_mat <- cbind("breast" = tpb_comparison$Fold, "cns" = tpc_comparison$Fold)
rownames(cor_mat) <- tpb_comparison$gene
data <- read.delim


pheatmap(
  mat               = cor_mat,
  color             = magma(10),
  border_color      = "black",
  show_colnames     = TRUE,
  show_rownames     = TRUE,
  drop_levels       = TRUE,
  fontsize          = 14,
  main              = "Comparison:
  FC levels of cns and breast top peak genes"
)

#correlation between top peak gene expression patterns in breast and cns tissues treated by lapatinib
cor_mat <- as.data.frame(cor_mat)
dif.FC.BC = data.frame(breast = cor_mat$breast -  mean(t(breastFC)), cns = cor_mat$cns -  mean(t(cnsFC))) #breast data - mean value, cns data - mean value
dif.FC.BC$gene = rownames(cor_mat)
# PLot

ggplot(dif.FC.BC,aes(x = gene, y = breast)) +
  geom_bar(stat = "identity", fill = "skyblue") + 
  geom_text(aes(label = round(breast, 2)), vjust = -0.5, color = "black", size = 3) + 
  coord_flip() + labs(title = "Mean graph plot of Fold Change for breast top peak genes")

ggplot(dif.FC.BC,aes(x = gene, y = cns)) +
  geom_bar(stat = "identity", fill = "skyblue") + 
  geom_text(aes(label = round(cns, 2)), vjust = -0.5, color = "black", size = 3) + 
  coord_flip() + labs(title = "Mean graph plot of Fold Change for CNS top peak genes")

#correlation
cor(cor_mat$breast, cor_mat$cns, method = "pearson")
## [1] 0.921812